MODEL COMPUTATION (reference and validation data pre processing)

Library loading, function definition and gene annotation

library(limma)
library(DESeq2)
library(org.Hs.eg.db)
library(readxl)
library(reshape2)
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(caret)
library(steFunctions)  #personal package, performs quantile normalization
library(EnsDb.Hsapiens.v79)
library(EnsDb.Hsapiens.v75)
library(heatmap3)
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(doMC)
library(pROC)

#define function to get color gradient 

getGradientColors <- function(colValue, minRange = NULL, maxRange = NULL, ccolors = c('green3', 'red'), bbreak = 0.01){
  library(heatmap3)
  if(is.null(minRange)){
    minRange <- min(colValue, na.rm = T) - 0.001
  }
  if(is.null(maxRange)){
    maxRange <- max(colValue, na.rm = T) + 0.001
  }
  bre <- seq(minRange, maxRange, bbreak) 
  colori <- colByValue(colValue, col = colorRampPalette(ccolors)(length(bre)-1), breaks= bre, cex.axis = 0.8)
  colori <- as.vector(colori)
  graphics.off()
  names(colori) <- names(colValue)
  return(colori)
}

MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
wdd="/home/mclaudia/MEDIA Project/OAStratification-pipeline/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

load(paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"),verbose=TRUE) #for TPM duplicate fix
## Loading objects:
##   OAvsnonOA
up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)

ensdf <- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type="DataFrame")
ens2gene <- as.data.frame(ensdf[,c("gene_id","gene_name")])

ensdf19 <- GenomicFeatures::genes(EnsDb.Hsapiens.v75, return.type="DataFrame")
ens2gene19 <- as.data.frame(ensdf19[,c("gene_id","gene_name")])

We use as reference Dataset 3 (Unpaired)

TPM matrix has been collected from txi-import file from Soul et al., 2017

load(paste0(wdd,"data/txi.RData"))

patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")

tmp <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))]  #collect TPM matrix
keep <- rowSums(tmp > 0) >= 35   #TPMs > 0 in at least 50% of the samples
tmp <-tmp[keep,]

tmp <-cbind(gene_id=rownames(tmp), tmp) 
tmp<-merge(tmp,ens2gene,by.y="gene_id")
tmp <-cbind(tmp$gene_name, tmp[,-72])
colnames(tmp)[1] <-"gene_name"
rownames(tmp) <-tmp$gene_id

dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp[,1]) 

tpm.mat_maintain <-tmp[!tmp$gene_name %in% gn_dup,] 

tpm2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tpm.mat_maintain[,-c(1,2)])))
colnames(tpm2) <-colnames(tpm.mat_maintain)[-c(1,2)]
rownames(tpm2) <-paste0("ENS_",1:length(gn_dup))

for(i in 1:length(gn_dup)){
  x=gn_dup[i]
  y=tmp[tmp[,1] %in% x,-c(1,2)]
  tpm2[i,1:ncol(tpm2)] <-apply(y,2,function(x) round(mean(as.numeric(x)),3))
}

tpm2 <-cbind(gene_name=gn_dup,tpm2) 
tpm2 <-tpm2[,colnames(tpm.mat_maintain)[-2]]
tpm3 <-rbind(tpm.mat_maintain[,-2],as.matrix(tpm2))

rownames(tpm3) <-tpm3[,1]

tpm.mat <-t(apply(tpm3[,-1],1,as.numeric))
colnames(tpm.mat) <-colnames(tpm3)[-1]
dim(tpm.mat)
## [1] 17804    70
train_tpm <-log2(tpm.mat+0.001)
boxplot(train_tpm)

Load Validation dataset from GSE114007 (see vignette 4)

load(paste0(MEDIAsave,"datasetValidation_full.RData"),verbose = TRUE)
## Loading objects:
##   newdataset

Download gene length for TPM trasformation for the Validation dataset

tmp<-newdataset
genes <- rownames(tmp)


geneAnn <-ens2gene19[ens2gene19$gene_name %in% genes,]
out <- as.data.frame(EDASeq::getGeneLengthAndGCContent(id = geneAnn$gene_id, org = 'hg19',mode = "org.db"))
out$gene_id <-rownames(out)
out <-merge(geneAnn,out,by="gene_id")
out <-out[complete.cases(out),]
out <-out[!duplicated(out$gene_name),]  
rownames(out) <-out$gene_name
gene_length <-out[,3,drop=FALSE]
tmp <-tmp[rownames(gene_length),]
dim(tmp)
## [1] 20685    38
identical(rownames(tmp),rownames(gene_length)) #TRUE
## [1] TRUE
y <- apply(tmp,2,function(x) x/as.numeric(gene_length[,1]))
tpm.mat2 <- t((t(y) * 1e6)/colSums(y))
head(colSums(tpm.mat2))
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##            1e+06            1e+06            1e+06            1e+06 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##            1e+06            1e+06
keep <- rowSums(tpm.mat2 > 0) >= 19   #TPMs > 0 in at least 50% of the samples
tpm.mat2 <-tpm.mat2[keep,]


validation_tpm <-log2(tpm.mat2+0.001)
rm(y)
boxplot(validation_tpm)

Rescale validation TPM-log2 data based on reference

1. Intersect reference genes with the ones of the sample of interest (or validation dataset)

2. quantile normalize the reference dataset

ii=intersect(rownames(train_tpm),rownames(validation_tpm)) #select common genes,14565

train_tpm <-train_tpm[ii,]
validation_tpm <-validation_tpm[ii,]

dim(train_tpm)
## [1] 14565    70
dim(validation_tpm)
## [1] 14565    38
train_tpm <-t(quantileNormalization(t(train_tpm)))

list_ranks <- as.numeric(sort(train_tpm[,1],decreasing=TRUE))

head(list_ranks)
## [1] 15.90471 14.80488 14.42682 14.01457 13.75508 13.53887

3. rescale the sample of interest for the quantile interval (or validation dataset)

list_val <-vector("list",ncol(validation_tpm))
names(list_val) <-colnames(validation_tpm)

list_val <-apply(validation_tpm, 2, function(x) as.list(sort(x,decreasing = TRUE)))

list_valRank <-vector("list",ncol(validation_tpm))
names(list_valRank) <-colnames(validation_tpm)


for(i in 1:length(list_val)){
  gg <-names(unlist(list_val[[i]]))
  list_valRank[[i]] <-list_ranks
  names(list_valRank[[i]]) <- gg
}

normdata2 <-matrix(NA,nrow = nrow(validation_tpm), ncol=ncol(validation_tpm))
rownames(normdata2) <-rownames(validation_tpm)
colnames(normdata2) <-colnames(validation_tpm)

for(i in 1:length(list_valRank)){
  tmp <-as.data.frame(list_valRank[[i]])
  rownames(tmp) <-names(list_valRank[[i]])
  normdata2[,i] <-tmp[rownames(normdata2),]
}
boxplot(normdata2)  #Validation re-scaled on quantile normalized reference

ELASTIC NET model (select most informative features)

  1. Run the model 100 times (100 x 20-elements samples, 10 random OA and 10 healthy samples)
  2. After, select the features occurred at least the 50% of times
  3. Compute mean coefficient values
  4. Compute scores per patient (linear combination) with the selected features gene expression TPM values
train_tpm_forRidge <- train_tpm
validation_tpm_forRidge <- normdata2

validation_tpm <-normdata2[rownames(normdata2) %in% c(up_genes$gene,dw_genes$gene),] #select consensus genes present
dim(validation_tpm)
## [1] 43 38
train_tpm <-train_tpm[rownames(train_tpm) %in% rownames(validation_tpm),]  
dim(train_tpm)
## [1] 43 70

Set 100 OA sample runs: balance data for normal samples (bootstrap)

set.seed(7894)
runs <-vector("list",100)
names(runs) <-paste0("run_",1:100)
runs <-lapply(runs,function(x) x <- sample(11:70, 10, replace = FALSE))

subsets <-vector("list",100)
names(subsets) <-paste0("run_",1:100)

for(i in 1:length(subsets)) subsets[[i]] <-train_tpm[,c(1:10,as.numeric(runs[[i]]))]  #create subsets

Train

Tune the model

trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

ClassBoot=as.factor(c(rep("N",10),rep("OA",10)))

list_tunes <-vector("list",100)
names(list_tunes) <-paste0("tunes_",1:100)

Train the model for 100 runs

nCores <- 150 #change number of threads if needed

registerDoMC(nCores)
for(i in 1:length(subsets)){
  train_dataset <-t(subsets[[i]])
  train <- train(train_dataset,
                           y = ClassBoot,
                           method = "glmnet", 
                           trControl = trainCtrl.rpcv,
                           metric = "Accuracy", allowParallel=TRUE,
                           tuneGrid = expand.grid(alpha = 0.5,  #for elastic net, median between 0 and 1
                                                  lambda = seq(0.001,1,by = 0.001)))
  list_tunes[[i]] <- coef(train$finalModel, train$finalModel$lambdaOpt)
}

Assess presence of each feature for each run

table_occurrence <-matrix(0,100,length(rownames(train_tpm)))
colnames(table_occurrence) <-rownames(train_tpm)
rownames(table_occurrence) <-paste0("run_",1:100)

for(i in 1:nrow(table_occurrence)){
  tmp=list_tunes[[i]][,1][-1]
  table_occurrence[i,names(tmp)] <-tmp
}
meanvalues <-table_occurrence

table_occurrence[table_occurrence != 0] <-1

tbO <-melt(table_occurrence)

ggplot(tbO, aes(y=Var2, x=Var1, fill=factor(value,levels = c("1","0"))))+
  geom_tile()+
  theme(axis.text.y = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=7))+
  xlab("Runs")+
  ylab("Genes")+
  scale_fill_manual(name="value",values=c("1"="red", "0"="blue"))

(selected <-sort(colSums(table_occurrence)[colSums(table_occurrence) >= 50],decreasing = TRUE))
## TNFSF11   LOXL3    DNER  TSPAN2   THBS3    ASPN   HTRA1    DYSF 
##     100      98      98      94      94      85      68      62

Select features with at least the 50% of occurrence and compute mean coefficient values across the runs

selected1 <-names(selected)

meanvalues <-meanvalues[,selected1]
(cf <-colMeans(meanvalues))
##    TNFSF11      LOXL3       DNER     TSPAN2      THBS3       ASPN      HTRA1 
## 0.14553148 0.08586524 0.17203360 0.04224252 0.10231979 0.03194829 0.02917054 
##       DYSF 
## 0.03668216
barplot(sort(cf,decreasing = TRUE), col = "dodgerblue", cex.names=1.2,font = 2, yaxt = "n") 
axis(side = 2) 

Compute score (sR) for Reference dataset , exploring also the distribution of this score across the patients

ClassTRfull=as.factor(c(rep("N",10),rep("OA",60)))
score_df <-data.frame(Patient=colnames(train_tpm), Score=NA, Class=ClassTRfull)

train_sc <-t(train_tpm[names(cf),])

sc <-rowSums(t(apply(train_sc,1,function(x) x*cf))) #compute score

rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$Score <-sc

score_df_train <-score_df
rm(score_df)

ggplot(score_df_train, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

ggplot(score_df_train, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 3)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()+
  theme(axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        legend.title=element_text(size=16), 
        legend.text=element_text(size=14))+
  guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients? REFERENCE

col1 <-getGradientColors(score_df_train$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_train$Score)+ 0.01, minRange = min(score_df_train$Score)-0.01)

score_df_train$col1 <-col1
score_df_train <-score_df_train[order(score_df_train$Patient),]

score_df_train$col2 <-c(rep("plum",10),rep("cyan",60))

sum(up_genes$gene %in% rownames(train_tpm))
## [1] 38
sum(dw_genes$gene %in% rownames(train_tpm))
## [1] 5
sel <-c(up_genes$gene,dw_genes$gene)[c(up_genes$gene,dw_genes$gene) %in%  rownames(train_tpm)]  #43
train_tpm <-train_tpm[sel,] 

annGenes <-data.frame(genes=sel, col3=c(rep("red",38),rep("blue",5)))

summ=summary(score_df_train$Score)
col_fun = colorRamp2(c(summ[1],summ[2],summ[5],summ[6]), c("#00BFFF","#197CDD","#0F4DB3","#000080"))

score_df_train <-score_df_train[order(score_df_train$Score,decreasing = FALSE),]  #order the patients for increasing Score
head(score_df_train)
##        Patient    Score Class    col1 col2
## MRI009  MRI009 1.340591     N #00BFFF plum
## MRI008  MRI008 1.443142     N #03B9FF plum
## MRI002  MRI002 1.489666     N #04B7FF plum
## MRI005  MRI005 1.532074     N #06B5FF plum
## MRI004  MRI004 1.672486     N #0AAEFF plum
## MRI006  MRI006 1.769805     N #0DAAFF plum

Heatmap: z-score computation of the data with patients sorted by increasing score

train_tpm_scal <-t(apply(train_tpm,1,scale))
rownames(train_tpm_scal) <-rownames(train_tpm)
colnames(train_tpm_scal) <-colnames(train_tpm)

train_tpm_scal[train_tpm_scal < -3] <- -3
train_tpm_scal[train_tpm_scal > 3] <- 3

heatmap3(train_tpm_scal[annGenes$genes,rownames(score_df_train)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(15,15),
         ColSideColors = as.matrix(score_df_train[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
       fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

Compute score (sR) for Validation dataset, exploring also the distribution of this score across the patients

ClassV=factor(c(rep("N",18),rep("OA",20)))
score_df <-data.frame(Patient=colnames(validation_tpm), Score=NA, Class=ClassV)

val_sc <-t(validation_tpm[names(cf),])
identical(names(cf), colnames(val_sc)) #TRUE
## [1] TRUE
sc <-rowSums(t(apply(val_sc,1,function(x) x*cf)))

rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$Score <-sc
head(sc)
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##         2.665540         3.051013         2.073371         3.084911 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##         1.887718         2.122148
score_df_val <-score_df
rm(score_df)

ggplot(score_df_val, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

ggplot(score_df_val, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()+
  theme(axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        legend.title=element_text(size=16), 
        legend.text=element_text(size=14))+
  guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients? VALIDATION

col1 <-getGradientColors(score_df_val$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_val$Score)+ 0.01, minRange = min(score_df_val$Score)-0.01)
score_df_val$col1 <-col1
score_df_val <-score_df_val[order(score_df_val$Patient),]

score_df_val$col2 <-c(rep("plum",18),rep("cyan",20))

validation_tpm <-validation_tpm[sel,] 
summ2=summary(score_df_val$Score)
col_fun2 = colorRamp2(c(summ2[1],summ2[2],summ2[5],summ2[6]), c("#00BFFF","#1C88F2","#0D40AB","#000080"))

score_df_val <-score_df_val[order(score_df_val$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_val)
##                         Patient     Score Class    col1 col2
## normal_06             normal_06 0.6873741     N #00BFFF plum
## normal_02             normal_02 1.1900648     N #0BACFF plum
## normal_04             normal_04 1.2291431     N #0CABFF plum
## normal_10             normal_10 1.4077748     N #10A5FF plum
## Normal_Cart_5_5 Normal_Cart_5_5 1.8877178     N #1B93FF plum
## normal_07             normal_07 1.9662685     N #1D91FF plum

Heatmap: z-score computation of the data with patients sorted by increasing score

validation_tpm_scal <-t(apply(validation_tpm,1,scale))
rownames(validation_tpm_scal) <-rownames(validation_tpm)
colnames(validation_tpm_scal) <-colnames(validation_tpm)

validation_tpm_scal[validation_tpm_scal < -3] <- -3
validation_tpm_scal[validation_tpm_scal > 3] <- 3

heatmap3(validation_tpm_scal[annGenes$genes,rownames(score_df_val)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(10,10),
         ColSideColors = as.matrix(score_df_val[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
       fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun2, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

Performance analysis: ROC curve on Validation dataset for sR score

score_df_val <-score_df_val[order(score_df_val$Class),]
rocobj <- roc(score_df_val$Class,score_df_val$Score) 

auc <-round(rocobj$auc,3)

ggroc(rocobj, size = 2, col="dodgerblue",legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score R (sR) on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

rocobj_val <-rocobj

RIDGE model (all the available features,43)

Train

Tune the model

validation_tpm <-validation_tpm_forRidge[rownames(validation_tpm_forRidge) %in% c(up_genes$gene,dw_genes$gene),]
train_tpm <-train_tpm_forRidge[rownames(train_tpm_forRidge) %in% rownames(validation_tpm),]  

trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

Train the model and save the coefficients

nCores <- 100 #change number if needed
registerDoMC(nCores)

train_dataset <-t(train_tpm)
train <- train(train_dataset,
                 y = ClassTRfull,
                 method = "glmnet", 
                 trControl = trainCtrl.rpcv,
                 metric = "Accuracy", allowParallel=TRUE,
                 tuneGrid = expand.grid(alpha = 0, #ridge
                                        lambda = seq(0.001,1,by = 0.001)))
cf_all <-coef(train$finalModel, train$finalModel$lambdaOpt)[,1][-1]

Compute score (sT) for Reference dataset using coefficients from the Ridge model

score_df_all_tr <-data.frame(Patient=colnames(train_tpm), Score=NA, Class=ClassTRfull)

train_sc_all <-t(train_tpm[names(cf_all),])

sc1 <-rowSums(t(apply(train_sc_all,1,function(x) x*cf_all))) 

rownames(score_df_all_tr) <-score_df_all_tr[,1]
score_df_all_tr <-score_df_all_tr[names(sc1),]
score_df_all_tr$Score <-sc1
head(sc1)
##   MRI001   MRI002   MRI003   MRI004   MRI005   MRI006 
## 7.028785 2.452862 4.865943 3.555113 4.605602 3.928878

Compute score (sT) for Validation dataset using coefficients from the Ridge model

score_df_all_val <-data.frame(Patient=colnames(validation_tpm), Score=NA, Class=ClassV)

val_sc_all <-t(validation_tpm[names(cf_all),])

sc <-rowSums(t(apply(val_sc_all,1,function(x) x*cf_all))) 

rownames(score_df_all_val) <-score_df_all_val[,1]
score_df_all_val <-score_df_all_val[names(sc),]
score_df_all_val$Score <-sc
head(sc)
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##         5.669915         7.437420         3.869674         6.863030 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##         5.256593         6.175890

Performance analysis: ROC curve on Validation dataset for sT score

score_df_all_val <-score_df_all_val[order(score_df_all_val$Score, decreasing=FALSE),]

rocobj_val_all<- roc(score_df_all_val$Class,score_df_all_val$Score) 

auc <-round(rocobj_val_all$auc,3)

ggroc(rocobj_val_all, size = 2, col="dodgerblue", legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score T (sT) on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

Compare the ROC curves on validation dataset to assess the efficacy of the sR prediction and apply DeLong’s test

list_rocs <-list(rocobj_val,rocobj_val_all)
names(list_rocs) <-c("Score R","Score T")

auc <- lapply(list_rocs, function(x) round(x$auc,3))

auc[1]
## $`Score R`
## [1] 0.875
auc[2]
## $`Score T`
## [1] 0.922
(rocts <-roc.test(rocobj_val,rocobj_val_all))
## 
##  DeLong's test for two ROC curves
## 
## data:  rocobj_val and rocobj_val_all
## D = -0.66741, df = 69.259, p-value = 0.5067
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8750000   0.9222222
ggroc(list_rocs, size = 2,legacy.axes = TRUE) +
    theme_bw()+
    annotate("text", 0.75, 0.44,  label=expression("AUC 8 features - score s"["R"] : 0.875) ,size=7)+
    annotate("text", 0.75,0.4, label=expression("AUC 43 features - score s"["T"] : 0.922) ,size=7)+
    labs(x = "1 - Specificity",y = "Sensitivity",color='Models')+
    theme(legend.title=element_text(size=16),legend.text=element_text(size=14))+
       guides(Models = guide_legend(override.aes = list(size = 4)))

Save resulting scores for the two models and the two datasets

list_scores <-list(score_df_all_tr, score_df_all_val, score_df_train, score_df_val)
names(list_scores) <-c("scoreT Training","scoreT Valid","scoreR Training","scoreR Valid")
save(list_scores,file=paste0(MEDIAsave,"list_scores.RData"))

Session Info

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] BSgenome.Hsapiens.UCSC.hg19_1.4.3      
##  [2] BSgenome_1.64.0                        
##  [3] rtracklayer_1.56.1                     
##  [4] Biostrings_2.64.1                      
##  [5] XVector_0.36.0                         
##  [6] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [7] pROC_1.18.4                            
##  [8] doMC_1.3.8                             
##  [9] iterators_1.0.14                       
## [10] foreach_1.5.2                          
## [11] circlize_0.4.15                        
## [12] ComplexHeatmap_2.12.1                  
## [13] lubridate_1.9.2                        
## [14] forcats_1.0.0                          
## [15] stringr_1.5.0                          
## [16] dplyr_1.1.3                            
## [17] purrr_1.0.1                            
## [18] readr_2.1.4                            
## [19] tidyr_1.3.0                            
## [20] tibble_3.2.1                           
## [21] tidyverse_2.0.0                        
## [22] heatmap3_1.1.9                         
## [23] EnsDb.Hsapiens.v75_2.99.0              
## [24] EnsDb.Hsapiens.v79_2.99.0              
## [25] ensembldb_2.20.2                       
## [26] AnnotationFilter_1.20.0                
## [27] GenomicFeatures_1.48.4                 
## [28] steFunctions_2017.01.20                
## [29] clue_0.3-64                            
## [30] caret_6.0-94                           
## [31] lattice_0.21-8                         
## [32] ggpubr_0.6.0                           
## [33] ggfortify_0.4.16                       
## [34] ggplot2_3.4.3                          
## [35] reshape2_1.4.4                         
## [36] readxl_1.4.3                           
## [37] org.Hs.eg.db_3.15.0                    
## [38] AnnotationDbi_1.58.0                   
## [39] DESeq2_1.36.0                          
## [40] SummarizedExperiment_1.26.1            
## [41] Biobase_2.56.0                         
## [42] MatrixGenerics_1.8.1                   
## [43] matrixStats_1.0.0                      
## [44] GenomicRanges_1.48.0                   
## [45] GenomeInfoDb_1.32.4                    
## [46] IRanges_2.30.1                         
## [47] S4Vectors_0.34.0                       
## [48] BiocGenerics_0.42.0                    
## [49] limma_3.52.4                           
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3               R.utils_2.12.2           tidyselect_1.2.0        
##   [4] RSQLite_2.3.1            BiocParallel_1.30.4      munsell_0.5.0           
##   [7] codetools_0.2-19         interp_1.1-4             future_1.33.0           
##  [10] withr_2.5.0              colorspace_2.1-0         filelock_1.0.2          
##  [13] highr_0.10               knitr_1.43               rstudioapi_0.15.0       
##  [16] ggsignif_0.6.4           listenv_0.9.0            labeling_0.4.3          
##  [19] GenomeInfoDbData_1.2.8   hwriter_1.3.2.1          farver_2.1.1            
##  [22] bit64_4.0.5              parallelly_1.36.0        vctrs_0.6.3             
##  [25] generics_0.1.3           ipred_0.9-14             xfun_0.39               
##  [28] timechange_0.2.0         BiocFileCache_2.4.0      fastcluster_1.2.3       
##  [31] EDASeq_2.30.0            R6_2.5.1                 doParallel_1.0.17       
##  [34] locfit_1.5-9.8           bitops_1.0-7             cachem_1.0.8            
##  [37] DelayedArray_0.22.0      BiocIO_1.6.0             scales_1.2.1            
##  [40] nnet_7.3-18              gtable_0.3.4             globals_0.16.2          
##  [43] timeDate_4022.108        rlang_1.1.1              genefilter_1.78.0       
##  [46] GlobalOptions_0.1.2      splines_4.2.1            rstatix_0.7.2           
##  [49] lazyeval_0.2.2           ModelMetrics_1.2.2.2     broom_1.0.5             
##  [52] yaml_2.3.7               abind_1.4-5              backports_1.4.1         
##  [55] tools_4.2.1              lava_1.7.2.1             jquerylib_0.1.4         
##  [58] RColorBrewer_1.1-3       Rcpp_1.0.11              plyr_1.8.8              
##  [61] progress_1.2.2           zlibbioc_1.42.0          RCurl_1.98-1.12         
##  [64] prettyunits_1.1.1        deldir_1.0-9             rpart_4.1.19            
##  [67] GetoptLong_1.0.5         cluster_2.1.4            magrittr_2.0.3          
##  [70] data.table_1.14.8        ProtGenerics_1.28.0      aroma.light_3.26.0      
##  [73] hms_1.1.3                evaluate_0.21            xtable_1.8-4            
##  [76] XML_3.99-0.14            jpeg_0.1-10              gridExtra_2.3           
##  [79] shape_1.4.6              compiler_4.2.1           biomaRt_2.52.0          
##  [82] crayon_1.5.2             R.oo_1.25.0              htmltools_0.5.6         
##  [85] tzdb_0.4.0               geneplotter_1.74.0       DBI_1.1.3               
##  [88] dbplyr_2.3.3             MASS_7.3-58.3            rappdirs_0.3.3          
##  [91] ShortRead_1.54.0         Matrix_1.5-4.1           car_3.1-2               
##  [94] cli_3.6.1                R.methodsS3_1.8.2        gower_1.0.1             
##  [97] pkgconfig_2.0.3          GenomicAlignments_1.32.1 recipes_1.0.8           
## [100] xml2_1.3.4               annotate_1.74.0          bslib_0.5.1             
## [103] hardhat_1.3.0            prodlim_2023.08.28       digest_0.6.32           
## [106] rmarkdown_2.24           cellranger_1.1.0         restfulr_0.0.15         
## [109] curl_5.0.1               Rsamtools_2.12.0         rjson_0.2.21            
## [112] lifecycle_1.0.3          nlme_3.1-162             jsonlite_1.8.7          
## [115] carData_3.0-5            fansi_1.0.4              pillar_1.9.0            
## [118] KEGGREST_1.36.3          fastmap_1.1.1            httr_1.4.7              
## [121] survival_3.5-5           glue_1.6.2               png_0.1-8               
## [124] bit_4.0.5                class_7.3-21             stringi_1.7.12          
## [127] sass_0.4.7               blob_1.2.4               latticeExtra_0.6-30     
## [130] memoise_2.0.1            future.apply_1.11.0